#coding:utf-8
import networkx as nx
from networkx.algorithms import bipartite
import pandas as pd
import numpy as np
import re
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import os
import cn2an
import chinese2digits as c2d
import operator
import plotly.express as px
import plotly
import plotly.graph_objects as go
import plotly as py
import plotly.offline as offline
offline.init_notebook_mode(connected=True)
from urllib.request import urlopen
import json
import requests
from plotly.subplots import make_subplots
from collections import Counter
from ipynb.fs.full.case_to_graph_only_method import convert_csv_graph, provinces_map
# Hard-coded local input location and directory-name lists. The three lists are
# the arguments of the (currently disabled) convert_csv_graph call below;
# presumably they name case-type / year / month sub-directories -- confirm.
base_url = "/Users/starice/Desktop/total_extracted_result/"
pre_dir = ['type1', 'type2', 'type3', 'type4']
dir_name = ['2014', '2015', '2016', '2017', '2018', '2019', '2020']
dir_sname = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
# csv_graph, _new_csvpd = convert_csv_graph(pre_dir[:], dir_name[:], dir_sname[:])
# The graph is loaded from a pickle on local disk instead of being rebuilt.
# NOTE(review): unpickling executes arbitrary code, so only load files you
# produced yourself. nx.read_gpickle was removed in NetworkX 3.0; on newer
# versions this line needs the plain `pickle` module instead.
csv_graph = nx.read_gpickle("/Users/starice/Desktop/csv_graph.gpickle")
# Used later when drawing the regional (province) division map.
# with open("/Users/starice/OwnFiles/cityu/RA/code/case_process/china_province.geojson") as f:
# provinces_map = json.load(f)
# NOTE(review): the Mapbox access token is committed in source; consider
# reading it from the environment instead.
mapbox_access_token = "pk.eyJ1Ijoic3RhcmljZSIsImEiOiJjazN6Y2s5dTUxY2R6M2xxcHllbXk4YWFzIn0.lActFqLzqRWGn7dqr4BShw"
px.set_mapbox_access_token(mapbox_access_token)
# Edge `title` values treated as plaintiff-side roles: a case edge whose
# `title` appears in this list is collected into `plaintiffs` further down.
# The strings are matched exactly, so they must stay byte-for-byte as they are.
# NOTE(review): '被上诉人一(原审被告一)' and '被上诉人二(原审被告二)' contain
# '原审被告' (original-trial defendant) yet are listed on the plaintiff side --
# confirm this is intended.
# NOTE(review): '被上诉人(原甲原告)' looks like a misspelling of '原审原告';
# presumably it is kept verbatim because it occurs in the data -- verify before
# "correcting" it.
plaintiff_titles = ['上诉人', '上诉人(一审原告)', '上诉人(一审第三人)', '上诉人(原告)', '上诉人(原审原告)', '上诉人(原审原告、反诉被告)',
'上诉人(原审原告人)', '上诉人(原审第三人)', '公益诉讼起诉人', '再审申请人', '再审申请人(一审原告)',
'再审申请人(一审原告、二审上诉人)',
'再审申请人(一审原告、二审被上诉人)', '再审申请人(原审原告)',
'再审申请人(原审原告、二审上诉人)', '再审申请人:(一审第三人、二审上诉人)', '原告',
'原告(反诉被告)', '抗诉机关', '支持起诉人',
'支持起诉机关', '申诉人(一审原告、二审上诉人)',
'申诉人(一审原告、二审上诉人、再审申请人)',
'申诉人(一审原告、二审上诉人、原再审申请人)',
'申诉人(一审原告、二审被上诉人)',
'申诉人(原审原告)',
'申请再审人(一审原告、二审上诉人)', '被上诉人(一审原告)', '被上诉人(原审原告)',
'被上诉人(原审原告、反诉被告)', '被上诉人(原甲原告)',
'被上诉人一(原审原告)',
'被上诉人一(原审被告一)',
'被上诉人二(原审被告二)', '被申请人(一审原告、二审上诉人)',
'被申请人(一审原告、二审被上诉人)', '被申请人(原审原告)']
# Edge `title` values treated as defendant-side roles: a case edge whose
# `title` appears in this list is collected into `defendants` further down.
# The strings are matched exactly (including stray or unbalanced brackets such
# as '一审被告二审上诉人)'), so they must stay byte-for-byte as they are.
# NOTE(review): the bare titles '被上诉人' and '被申请人' are classified as
# defendants here, while bare '上诉人' / '再审申请人' are classified as
# plaintiffs in plaintiff_titles; likewise '被上诉人(原审第三人)' is listed
# here but '上诉人(原审第三人)' is a plaintiff title. Presumably a deliberate
# heuristic -- confirm.
defendant_titles = ['(一审被告、二审被上诉人)', '一审被告', '一审被告(二审上诉人)', '一审被告、二审被上诉人', '一审被告二审上诉人)',
'上上诉人(原审被告)', '上诉人(一审被告)', '上诉人(原审第一被告)', '上诉人(原审被告)',
'上诉人(原审被告、反诉原告)',
'上诉人(原审被告一)', '上诉人(被告)', '公益诉讼出庭人', '再审申请人(一审被告)',
'再审申请人(一审被告、二审上诉人)',
'再审申请人(一审被告、二审被上诉人)', '再审申请人(再审被告)', '再审申请人(原审被告)', '原审当事人(原审被告)',
'原审第三被告',
'原审第二被告',
'原审被告',
'原审被告(反诉原告)', '特别授权被告', '申请再审人(原审被告)',
'第一被告', '第三被告',
'第二被告',
'被上诉人',
'被上诉人(一审被告)', '被上诉人(原审第三人)',
'被上诉人(原审被告)',
'被上诉人(原审被告、反诉原告)',
'被上诉人(原审被告人)', '被告',
'被告(反诉原告)',
'被告一',
'被告二', '被申诉人(一审被告,二审被上诉人)',
'被申诉人(一审被告、二审上诉人)',
'被申诉人(一审被告、二审被上诉人)',
'被申诉人(一审被告、二审被上诉人、再审被申请人)',
'被申诉人(一审被告、二审被上诉人、原再审被申请人)',
'被申诉人(原审被告)',
'被申请人', '被申请人(一审被告)',
'被申请人(一审被告,二审被上诉人)',
'被申请人(一审被告、二审上诉人)',
'被申请人(一审被告、二审被上诉人)', '被申请人(原审被告)', '被申请人(原审被告、二审被上诉人)']
# Case nodes are the graph nodes whose `bipartite` attribute equals 0.
cases = [
    (node, attrs)
    for node, attrs in csv_graph.nodes(data=True)
    if attrs['bipartite'] == 0
]

# Build the node list and the title lookup sets once. The original rebuilt the
# node list for each query and scanned the title lists linearly for every edge.
case_ids = [node for node, _ in cases]
plaintiff_title_set = frozenset(plaintiff_titles)
defendant_title_set = frozenset(defendant_titles)

# Fetch the edges incident to case nodes a single time and classify each
# (case, party, attrs) triple by its `title` attribute. An edge whose title is
# in neither set is dropped from both lists, exactly as before.
case_edges = list(csv_graph.edges(nbunch=case_ids, data=True))
plaintiffs = [edge for edge in case_edges if edge[2]['title'] in plaintiff_title_set]
defendants = [edge for edge in case_edges if edge[2]['title'] in defendant_title_set]
# Relationship between active individual plaintiffs' claim amounts and win rate.
# Party tables: one row per (case, plaintiff) edge and per (case, defendant) edge.
pd_plaintiffs = pd.DataFrame({
    "case_id": [edge[0] for edge in plaintiffs],
    "plaintiff": [edge[1] for edge in plaintiffs],
})
pd_defendants = pd.DataFrame({
    "case_id": [edge[0] for edge in defendants],
    "defendant": [edge[1] for edge in defendants],
})

# Case table: the node id plus the listed node attributes, in this column order.
# ("lat" and "lon" are intentionally not included; they were disabled upstream.)
case_attribute_columns = [
    "judgement_date", "is_success", "court_name", "procedure", "judge",
    "legalfee", "objectmoney", "province", "city", "reason", "district",
    "penalty",
]
case_columns = {"case_id": [case_id for case_id, _ in cases]}
for column in case_attribute_columns:
    case_columns[column] = [attrs[column] for _, attrs in cases]
pd_cases = pd.DataFrame(case_columns)

# Split the judgement date into year / month / day columns.
judgement_dates = pd.DatetimeIndex(pd_cases['judgement_date'])
pd_cases['year'] = judgement_dates.year
pd_cases['month'] = judgement_dates.month
pd_cases['day'] = judgement_dates.day

# Left-join both party tables onto the cases, so a case appears once per
# plaintiff/defendant combination.
all_cases = (
    pd_cases
    .merge(pd_plaintiffs, on="case_id", how="left")
    .merge(pd_defendants, on="case_id", how="left")
)

# Delete the rows whose plaintiff name is an anonymisation placeholder.
has_invalid_plaintiff = all_cases['plaintiff'].isin(["XX", "xx", "**"])
all_cases.drop(all_cases.index[has_invalid_plaintiff.to_numpy()], inplace=True)
all_cases['is_success'].drop_duplicates()
0 TRUE 2 FALSE 11177 unknown Name: is_success, dtype: object
# Keep only the first-instance ("一审") cases.
is_first_instance = all_cases['procedure'] == "一审"
first_cases = all_cases[is_first_instance]
print("一审案件数量: ", len(first_cases['case_id'].unique()))

# Degree of every plaintiff node = number of distinct first-instance cases it
# is attached to; the distinct case ids themselves are kept in `case_id`.
degree_1stplaintiffs = (
    first_cases
    .groupby("plaintiff")['case_id']
    .unique()
    .reset_index()
)
degree_1stplaintiffs['case_count'] = degree_1stplaintiffs['case_id'].apply(len)
degree_1stplaintiffs = degree_1stplaintiffs.sort_values(by="case_count", ascending=False)

# The 200 plaintiffs with the most cases.
fps_200 = degree_1stplaintiffs.head(200)
fps_200
一审案件数量: 32445
| plaintiff | case_id | case_count | |
|---|---|---|---|
| 1482 | 张丙刚 | [5a39bdcee138233419f20ee5, 5ec89df9401b5029842... | 605 |
| 4022 | 贾龙 | [57a953ccc2265c04d1f53ceb, 57a7e3cec2265c04d1e... | 473 |
| 448 | 刘庆生 | [5810d7902f12c631929401b7, 57a96425c2265c28a54... | 438 |
| 3628 | 胡玉宝 | [581211f72f12c63192caff8b, 581030f32f12c631927... | 433 |
| 3490 | 罗伟 | [595ee0c7f75d3f7286d4abf4, 59e30e9de13823d5d95... | 387 |
| ... | ... | ... | ... |
| 4732 | 韩进虎 | [57a924d5c2265c04d1f3cdaa, 57a93e4ac2265c04d1f... | 30 |
| 4816 | 马超 | [5ec834b67116c36f1b9238e4, 5badf514dc9c5352aec... | 30 |
| 104 | 付星 | [57abe374c2265c25898623df, 58110d742f12c631929... | 29 |
| 184 | 何林松 | [5ec930c23cdef0087eb0f276, 5e9a325fe2f4b705184... | 29 |
| 221 | 余定勇 | [58271c3f2f12c65b4adab57d, 588e5ce65a0d8472520... | 29 |
200 rows × 3 columns
def append_gender_birth(row):
    """Copy the plaintiff node's ``gender`` and ``birth_date`` onto *row*.

    The node is looked up in the module-level ``csv_graph`` by
    ``row['plaintiff']``. *row* is modified in place and also returned, which
    makes the function usable with ``DataFrame.apply(..., axis=1)``.

    Raises:
        KeyError: if the plaintiff is not a node of ``csv_graph`` or the node
            lacks one of the two attributes.
    """
    node_attrs = csv_graph.nodes[row['plaintiff']]
    for field in ('gender', 'birth_date'):
        row[field] = node_attrs[field]
    return row
# Year against which plaintiff ages are computed (previously an inline literal).
AGE_REFERENCE_YEAR = 2021


def _dotted_birth_date(birth_date):
    """Convert a date such as "1965年2月5日" into the dotted form "1965.2.5".

    The text is split on the 年/月/日 markers and the last piece is discarded,
    so a value containing none of the markers (e.g. a missing value) yields "".
    NOTE(review): a date that does not end with a marker loses its last
    component -- confirm the data always ends in 日.
    """
    return ".".join(re.split('年|月|日', str(birth_date))[:-1])


def _age_from_dotted_birth(dotted, reference_year=AGE_REFERENCE_YEAR):
    """Return ``reference_year - birth year`` for a dotted birth date.

    Returns None when the leading component is empty or not an integer (for
    example an anonymised year). The original code returned None only for the
    empty string and raised ValueError for any other non-numeric year.
    """
    year_text = str(dotted).split(".")[0]
    try:
        return reference_year - int(year_text)
    except ValueError:
        return None


# Attach gender / birth date to each active plaintiff, then derive the dotted
# birth date and the age from it.
fps_200 = fps_200.apply(append_gender_birth, axis=1)
fps_200['digital_birth'] = fps_200['birth_date'].apply(_dotted_birth_date)
fps_200['age'] = fps_200['digital_birth'].apply(_age_from_dotted_birth)
fps_200.head()
| plaintiff | case_id | case_count | gender | birth_date | digital_birth | age | |
|---|---|---|---|---|---|---|---|
| 1482 | 张丙刚 | [5a39bdcee138233419f20ee5, 5ec89df9401b5029842... | 605 | 男 | 1965年2月5日 | 1965.2.5 | 56.0 |
| 4022 | 贾龙 | [57a953ccc2265c04d1f53ceb, 57a7e3cec2265c04d1e... | 473 | 男 | 1988年4月1日 | 1988.4.1 | 33.0 |
| 448 | 刘庆生 | [5810d7902f12c631929401b7, 57a96425c2265c28a54... | 438 | 男 | 1975年10月11日 | 1975.10.11 | 46.0 |
| 3628 | 胡玉宝 | [581211f72f12c63192caff8b, 581030f32f12c631927... | 433 | 男 | 1986年5月14日 | 1986.5.14 | 35.0 |
| 3490 | 罗伟 | [595ee0c7f75d3f7286d4abf4, 59e30e9de13823d5d95... | 387 | 男 | 1984年10月4日 | 1984.10.4 | 37.0 |
# Gender statistics: number of rows of fps_200 per gender. The count lands in
# the `case_id` column, which is why that column is plotted on the y axis.
gender_counts = fps_200.groupby('gender')['case_id'].count().reset_index()
fig = px.bar(
    gender_counts,
    x='gender',
    y='case_id',
    title="Gender Distribution of Active Plaintiffs",
)
fig.show()
# Age statistics: histogram of the `age` column with at most 5 bins.
age_histogram_options = {
    "x": 'age',
    "nbins": 5,
    "title": "Age Distribution of Active Plaintiffs",
}
fig = px.histogram(fps_200, **age_histogram_options)
# Leave a visible gap between neighbouring bars.
fig.update_layout(bargap=0.2)
fig.show()
# First-instance case rows whose plaintiff is one of the names in fps_200.
is_active_plaintiff = first_cases['plaintiff'].isin(fps_200['plaintiff'])
new_selected_1stcp = first_cases[is_active_plaintiff]
# Distinct case count per (province, outcome) pair.
(
    new_selected_1stcp
    .groupby(["province", "is_success"])['case_id']
    .nunique()
    .reset_index()
)
| province | is_success | case_id | |
|---|---|---|---|
| 0 | 上海市 | FALSE | 17 |
| 1 | 上海市 | TRUE | 188 |
| 2 | 云南省 | TRUE | 10 |
| 3 | 北京市 | FALSE | 143 |
| 4 | 北京市 | TRUE | 1835 |
| 5 | 吉林省 | FALSE | 1 |
| 6 | 四川省 | FALSE | 42 |
| 7 | 四川省 | TRUE | 186 |
| 8 | 天津市 | FALSE | 493 |
| 9 | 天津市 | TRUE | 2496 |
| 10 | 宁夏回族自治区 | FALSE | 2 |
| 11 | 宁夏回族自治区 | TRUE | 29 |
| 12 | 安徽省 | FALSE | 10 |
| 13 | 安徽省 | TRUE | 84 |
| 14 | 山东省 | FALSE | 10 |
| 15 | 山东省 | TRUE | 9 |
| 16 | 广东省 | FALSE | 177 |
| 17 | 广东省 | TRUE | 1194 |
| 18 | 广西壮族自治区 | FALSE | 86 |
| 19 | 广西壮族自治区 | TRUE | 282 |
| 20 | 江苏省 | FALSE | 46 |
| 21 | 江苏省 | TRUE | 99 |
| 22 | 江西省 | TRUE | 2 |
| 23 | 河北省 | TRUE | 6 |
| 24 | 河南省 | FALSE | 2 |
| 25 | 河南省 | TRUE | 47 |
| 26 | 浙江省 | FALSE | 5 |
| 27 | 浙江省 | TRUE | 45 |
| 28 | 海南省 | TRUE | 2 |
| 29 | 湖北省 | FALSE | 37 |
| 30 | 湖北省 | TRUE | 180 |
| 31 | 湖南省 | FALSE | 16 |
| 32 | 湖南省 | TRUE | 13 |
| 33 | 福建省 | FALSE | 220 |
| 34 | 福建省 | TRUE | 6 |
| 35 | 贵州省 | TRUE | 49 |
| 36 | 辽宁省 | FALSE | 68 |
| 37 | 辽宁省 | TRUE | 945 |
| 38 | 重庆市 | FALSE | 757 |
| 39 | 重庆市 | TRUE | 6043 |
| 40 | 陕西省 | FALSE | 1 |
| 41 | 陕西省 | TRUE | 133 |
| 42 | 黑龙江省 | FALSE | 3 |
| 43 | 黑龙江省 | TRUE | 42 |
# Yearly number of distinct cases, and of distinct won cases, in Chongqing.
new_selected_1stcp_chongqing = new_selected_1stcp[new_selected_1stcp['province'] == "重庆市"]
# Only rows whose is_success is exactly the string "TRUE" count as won.
chongqing_won = new_selected_1stcp_chongqing[new_selected_1stcp_chongqing['is_success'] == "TRUE"]

gbcq_nselected_1stcp_cc = new_selected_1stcp_chongqing.groupby('year')['case_id'].nunique().reset_index()
gbcq_nselected_1stcp_sc = chongqing_won.groupby('year')['case_id'].nunique().reset_index()
gbcq_nselected_1stcp_cc.rename(columns={"case_id": "case_count"}, inplace=True)
gbcq_nselected_1stcp_sc.rename(columns={"case_id": "success_count"}, inplace=True)

gbcq_nselected_1stcp = gbcq_nselected_1stcp_cc.merge(gbcq_nselected_1stcp_sc, on="year", how='left')
# A year without any won case has no row on the right side of the left join.
# Record it as 0 instead of NaN, matching the other year/province aggregations
# in this notebook.
gbcq_nselected_1stcp['success_count'] = gbcq_nselected_1stcp['success_count'].fillna(0)
gbcq_nselected_1stcp.head()
| year | case_count | success_count | |
|---|---|---|---|
| 0 | 2014 | 21 | 21 |
| 1 | 2015 | 141 | 132 |
| 2 | 2016 | 418 | 388 |
| 3 | 2017 | 2524 | 2158 |
| 4 | 2018 | 1876 | 1710 |
# Grouped bars per year: all cases next to won cases, Chongqing only.
chongqing_chart_title = "The Distribution of cases and success cases related with the first 200 active plaintiffs in Chongqing"
chongqing_chart_series = ["case_count", "success_count"]
fig = px.bar(
    gbcq_nselected_1stcp,
    x="year",
    y=chongqing_chart_series,
    title=chongqing_chart_title,
)
# Draw the two series side by side.
fig.update_layout(barmode="group")
fig.show()
# Distinct plaintiffs among the selected first-instance cases in Chongqing.
in_chongqing = new_selected_1stcp['province'] == "重庆市"
new_selected_1stcp_chongqing = new_selected_1stcp[in_chongqing]
new_selected_1stcp_chongqing['plaintiff'].drop_duplicates()
147 秦东
159 况力彬
198 晏勇
244 强大应
458 周开礼
...
41944 李军
42026 任满仓
43977 王飞
45582 陈浩天
48261 代国海
Name: plaintiff, Length: 63, dtype: object
# Number of cases and number of won cases, by year.
won_by_year_rows = new_selected_1stcp[new_selected_1stcp['is_success'] == "TRUE"]
time_nselected_1stcp_cc = (
    new_selected_1stcp
    .groupby("year")['case_id']
    .nunique()
    .reset_index(name="case_count")
)
time_nselected_1stcp_sc = (
    won_by_year_rows
    .groupby("year")['case_id']
    .nunique()
    .reset_index(name="success_count")
)
# Left join on the shared `year` column; missing values become 0.
time_nselected_1stcp = time_nselected_1stcp_cc.merge(time_nselected_1stcp_sc, how="left")
time_nselected_1stcp = time_nselected_1stcp.fillna(0)
display(time_nselected_1stcp.head())
| year | case_count | success_count | |
|---|---|---|---|
| 0 | 2014 | 147 | 142 |
| 1 | 2015 | 633 | 593 |
| 2 | 2016 | 1639 | 1494 |
| 3 | 2017 | 6798 | 6017 |
| 4 | 2018 | 3539 | 3109 |
# Plot: number of cases and number of won cases, by year.
yearly_series = ["case_count", "success_count"]
fig = px.bar(data_frame=time_nselected_1stcp, x="year", y=yearly_series)
# Draw the two series side by side.
fig.update_layout(barmode='group')
fig.show()
# Number of cases and number of won cases, by province.
won_by_province_rows = new_selected_1stcp[new_selected_1stcp['is_success'] == "TRUE"]
province_nselected_1stcp_cc = (
    new_selected_1stcp
    .groupby("province")['case_id']
    .nunique()
    .reset_index(name="case_count")
)
province_nselected_1stcp_sc = (
    won_by_province_rows
    .groupby("province")['case_id']
    .nunique()
    .reset_index(name="success_count")
)
# Left join on the shared `province` column; missing values become 0.
province_nselected_1stcp = province_nselected_1stcp_cc.merge(province_nselected_1stcp_sc, how="left")
province_nselected_1stcp = province_nselected_1stcp.fillna(0)
display(province_nselected_1stcp.head())
| province | case_count | success_count | |
|---|---|---|---|
| 0 | 上海市 | 205 | 188.0 |
| 1 | 云南省 | 10 | 10.0 |
| 2 | 北京市 | 1978 | 1835.0 |
| 3 | 吉林省 | 1 | 0.0 |
| 4 | 四川省 | 228 | 186.0 |
# Plot: number of cases and number of won cases, by province.
province_series = ["case_count", "success_count"]
fig = px.bar(data_frame=province_nselected_1stcp, x="province", y=province_series)
# Draw the two series side by side.
fig.update_layout(barmode='group')
fig.show()
# Number of cases and won cases by year and province (the win rate is derived
# from these two counts afterwards).
year_province_keys = ["year", "province"]
won_by_year_province_rows = new_selected_1stcp[new_selected_1stcp['is_success'] == "TRUE"]
tp_nselected_1stcp_cc = (
    new_selected_1stcp
    .groupby(year_province_keys)['case_id']
    .nunique()
    .reset_index(name="case_count")
)
tp_nselected_1stcp_sc = (
    won_by_year_province_rows
    .groupby(year_province_keys)['case_id']
    .nunique()
    .reset_index(name="success_count")
)
# Left join on the shared `year` and `province` columns; missing values become 0.
tp_nselected_1stcp = tp_nselected_1stcp_cc.merge(tp_nselected_1stcp_sc, how="left")
tp_nselected_1stcp = tp_nselected_1stcp.fillna(0)
# (A per-row longitude/latitude lookup via find_lonlat_province was disabled here.)
# Store the year as text.
tp_nselected_1stcp["year"] = tp_nselected_1stcp["year"].astype(str)
display(tp_nselected_1stcp.head())
| year | province | case_count | success_count | |
|---|---|---|---|---|
| 0 | 2014 | 上海市 | 20 | 17.0 |
| 1 | 2014 | 北京市 | 21 | 21.0 |
| 2 | 2014 | 天津市 | 1 | 1.0 |
| 3 | 2014 | 安徽省 | 10 | 10.0 |
| 4 | 2014 | 广东省 | 40 | 38.0 |
# To make trends easier to show, min-max normalise the counts. Both columns are
# rescaled together with one shared minimum and maximum, which changes only
# the scale of the values, not their distribution.
tp_nselected_1stcp['success_rate'] = tp_nselected_1stcp['success_count'] / tp_nselected_1stcp['case_count']

# First half of num_list: case counts; second half: success counts.
num_list = list(tp_nselected_1stcp['case_count']) + list(tp_nselected_1stcp['success_count'])
print(len(num_list))
# The split point is derived from the data. The original sliced at the literal
# 119, which is only correct while the frame has exactly 119 rows.
half_num_list = len(num_list) // 2

amin, amax = min(num_list), max(num_list)
value_range = amax - amin
# When every value is identical the range is 0; map everything to 0.0 instead
# of dividing by zero.
num_list = [
    (val - amin) / value_range if value_range else 0.0
    for val in num_list
]

tp_nselected_1stcp['norm_case_count'] = num_list[:half_num_list]
tp_nselected_1stcp['norm_success_count'] = num_list[half_num_list:]
tp_nselected_1stcp = tp_nselected_1stcp.sort_values(by=['year', 'province'], ascending=True)
display(tp_nselected_1stcp[tp_nselected_1stcp['province']=="河北省"])
238
| year | province | case_count | success_count | success_rate | norm_case_count | norm_success_count | |
|---|---|---|---|---|---|---|---|
| 22 | 2015 | 河北省 | 3 | 3.0 | 1.0 | 0.001189 | 0.001189 |
| 40 | 2016 | 河北省 | 1 | 1.0 | 1.0 | 0.000396 | 0.000396 |
| 59 | 2017 | 河北省 | 2 | 2.0 | 1.0 | 0.000792 | 0.000792 |
# Map of the win rate by province, animated over the years.
fig = px.choropleth_mapbox(
    data_frame=tp_nselected_1stcp,
    geojson=provinces_map,
    color='success_rate',
    locations="province",
    # Provinces are matched to GeoJSON features through this feature property.
    featureidkey="properties.NL_NAME_1",
    color_continuous_scale=px.colors.sequential.Magenta,
    # success_rate is success_count / case_count, i.e. within [0, 1]. Pinning
    # the colour range keeps colours comparable from one year's frame to the
    # next instead of depending on automatic scaling.
    range_color=(0, 1),
    center={"lat": 37.110573, "lon": 106.493924},
    animation_frame="year",
    zoom=3,
    hover_data=['case_count'],
    # Title typo fixed ("Gerographical" -> "Geographical").
    title="Temporal & Geographical Distribution of Success Rate",
)
fig.update_layout(height=800)
# Set the per-frame duration to 2000 ms on the first button of the first
# update menu (presumably the play button that `animation_frame` adds --
# confirm if the layout is customised).
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000
fig.show()